# Data cleaning 
# Article: Misinformation among Migrants
## Author: Daniel Rojas

# Remove every object from the current environment so the script starts
# from an empty workspace.
# NOTE(review): this also deletes the objects of anyone who source()s this
# file from an interactive session; running the script in a fresh R session
# achieves the same without that side effect.
rm(list = ls())


# Set the working directory to the source file location before running

# Function to install or load packages
# Attach each package named in `packages`, installing it first when it is
# not yet available on this machine.
#   packages: character vector of package names.
# Returns NULL invisibly (the value of the loop); called for its side effects.
install_and_load = function(packages) {
    for (package_name in packages) {
        already_installed = requireNamespace(package_name, quietly = TRUE)
        if (!already_installed) {
            # pull in the package together with its dependencies
            install.packages(package_name, dependencies = TRUE)
        }
        # character.only = TRUE: the name is held in a variable, not typed literally
        library(package_name, character.only = TRUE)
    }
}

# Install/load packages
# tidyverse provides read_csv(), the dplyr verbs, case_when() and the str_*()
# helpers used below; haven is attached as well.
install_and_load(c('tidyverse', 'haven'))


# Read one survey export and keep only the analysis sample.
#   csv_path: path to the exported CSV file.
# Steps, in order:
#   1. drop the first two data rows of the export (extra header rows --
#      presumably Qualtrics label/import-id rows; confirm against the file),
#   2. keep responses whose StartDate is after 2024-03-20 (earlier ones are
#      treated as pilot data),
#   3. drop rows without an email,
#   4. keep the first row per email (removes duplicated respondents).
read_survey_export = function(csv_path) {
    pilot_cutoff = as.Date("2024-03-20")
    responses = read_csv(csv_path)[-c(1:2),]
    after_pilot = as.Date(responses$StartDate) > pilot_cutoff
    responses[after_pilot, ] %>%
        filter(!is.na(email)) %>%
        filter(!duplicated(email))
}

# text-labelled export (answers as Spanish strings)
dt_text = read_survey_export('Misinformation_MX_April+10,+2024_08.28.csv')

# numerically coded export
dt_num = read_survey_export('Misinformation_MX_April+10,+2024_08.29.csv')

# create variables for descriptives

# Position (1-8) of a Spanish education answer on the ordered scale below;
# NA for anything that is not one of the eight categories.
education_position = function(answer) {
    match(answer, c('Ninguno',
                    'Primaria Incompleta',
                    'Primaria',
                    'Secundaria Incompleta',
                    'Secundaria',
                    'Universidad Incompleta',
                    'Universidad',
                    'Posgrado'))
}

# English label for the yes / no / don't-know cyberbullying answers;
# unmatched or missing answers become NA.
label_yes_no = function(answer) {
    case_when(answer=='Si' ~ 'Yes',
              answer=='No se/No respondo' ~ 'DK/DA',
              answer=='No' ~ 'No',
              TRUE ~ NA_character_)
}

# English label for the four-point "how much" scale of the information
# outlet items; unmatched or missing answers become NA.
label_amount = function(answer) {
    case_when(answer=='Nada' ~ 'Not at all',
              answer=='Algo' ~ 'Some',
              answer=='Poco' ~ 'A little',
              answer=='Mucho' ~ 'A lot',
              TRUE ~ NA_character_)
}

# Keep only the recoded descriptive columns plus `email`, which is the key
# shared with the numeric export.
dt_text_subset = dt_text %>%
    transmute(
        # country of origin: translate three labels, pass the rest through
        origin = case_when(country=='Otro, ¿cuál?' ~ 'Other',
                           country=='Haití' ~ 'Haiti',
                           country=='Perú' ~ 'Peru',
                           TRUE ~ country),
        # education: English label and a 0-7 ordinal score (0 = none)
        education_descriptive = c('None',
                                  'Incomplete primary',
                                  'Complete primary',
                                  'Incomplete secondary',
                                  'Complete secondary',
                                  'Incomplete college',
                                  'Complete college',
                                  'Postgraduate degree')[education_position(education)],
        education_clean = education_position(education) - 1,
        # every answer other than the two named groups, including a missing
        # one, falls into 'Other'
        race_descriptive = case_when(race=='Afrodescendiente' ~ 'Afro',
                                     race=='Indígena' ~ 'Indigenous group',
                                     TRUE ~ 'Other'),
        # missing religion also ends up in 'Other'
        religion_descriptive = case_when(
            religion=='Católico' ~ 'Catholic',
            religion %in% c('Protestante', 'Evangélica o pentecostal') ~ 'Protestant',
            religion %in% c('Agnóstico o ateo', 'Ninguna') ~ 'None',
            TRUE ~ 'Other'),
        # divorced and separated respondents share one category
        marital_status_descriptive = case_when(
            marital_status=='Casado' ~ 'Married',
            marital_status=='Unión libre' ~ 'Free union',
            marital_status=='Soltero' ~ 'Single',
            marital_status %in% c('Divorciado', 'Separado') ~ 'Separated',
            marital_status=='Viudo' ~ 'Widow',
            TRUE ~ NA_character_),
        # current location: translate two labels, pass the rest through
        location_descriptive = case_when(location=='Otra, ¿cuál?' ~ 'Other',
                                         location=='Ciudad de México' ~ 'Mexico City',
                                         TRUE ~ location),
        # missing destination also ends up in 'Other'
        destination_descriptive = case_when(destination=='Canadá' ~ 'Canada',
                                            destination=='España' ~ 'Spain',
                                            destination=='Estados Unidos' ~ 'USA',
                                            destination=='No sé' ~ 'DK',
                                            TRUE ~ 'Other'),
        age_num = as.numeric(age_1),
        email,
        cbp_knowledge_descriptive = case_when(
            digital_literacy_cbp=='Ningún conocimiento' ~ 'No knowledge',
            digital_literacy_cbp=='Poco conocimiento' ~ 'Little knowledge',
            digital_literacy_cbp=='Algún conocimiento' ~ 'Some knowledge',
            digital_literacy_cbp=='Pleno conocimiento' ~ 'Full knowledge',
            TRUE ~ NA_character_),
        # baseline cyberbullying items
        cyberbullying1_descriptive = label_yes_no(cyberbullying1_b),
        cyberbullying2_descriptive = label_yes_no(cyberbullying2_b),
        cyberbullying3_descriptive = label_yes_no(cyberbullying3_b),
        cyberbullying4_descriptive = label_yes_no(cyberbullying4_b),
        cyberbullying5_descriptive = label_yes_no(cyberbullying5_b),
        cyberbullying6_descriptive = label_yes_no(cyberbullying6_b),
        # information outlet items
        outlet_newspaper_descriptive = label_amount(info_outlet_newspaper),
        outlet_gov_descriptive = label_amount(info_outlet_gov),
        outlet_fb_descriptive = label_amount(info_outlet_fb),
        outlet_tt_descriptive = label_amount(info_outlet_tt),
        outlet_twitter_descriptive = label_amount(info_outlet_twitter),
        outlet_wa_descriptive = label_amount(info_outlet_wa))

# Join the numeric export with the descriptive recodes (merge() matches on
# the column names the two tables share), then drop the respondents' email
# so the analysis data carries no contact information.
dt = dt_num %>% 
    merge(dt_text_subset) %>% 
    select(-c('email'))

# create variables for analyses
# recoding function
# Dichotomise an item: 1 where the answer equals 1, 0 for any other answer,
# NA where the answer is missing.
#   variable: vector of item codes.
sharing_recoding = function(variable){
    answered_one = variable==1
    ifelse(answered_one, 1,0)
}


# Build every analysis variable on the merged data in one mutate() call.
# Later arguments rely on columns created by earlier ones, so the order of
# the arguments matters.
# Changes relative to the previous version:
#   * info_outlet_newspaper is converted to numeric like its sibling items;
#   * mean-split dummies use na.rm = TRUE (as the endline ones already did),
#     so a single missing answer no longer turns the whole dummy into NA;
#   * location_cat matches the recoded label 'Mexico City';
#   * exposure_misinfo_b averages recall items 1, 2 and 3 (item 3 used to be
#     counted twice and item 2 not at all).
dt = dt %>% 
    mutate(# the export is read as text, so convert the coded items to numbers
           across(c(motivated_reason_1_1,motivated_reason_2_1,trust_people,
                    value_democracy,digital_literacy_1,digital_literacy_2,
                    digital_literacy_3,digital_literacy_4,digital_literacy_5,
                    digital_literacy_6,digital_literacy_7,digital_literacy_8,
                    digital_literacy_9,digital_literacy_10,digital_literacy_11,
                    digital_literacy_12,digital_literacy_13,digital_literacy_14,
                    digital_literacy_cbp,`timing_c_treatment_Page Submit`,
                    `timing_m_treatment_Page Submit`,attention_check1,attention_check2,
                    online_risk1_b_1,online_risk2_b_1,online_risk3_b_1,
                    online_risk1_e_1,online_risk2_e_1,online_risk3_e_1,
                    cyberbullying1_b,cyberbullying2_b,cyberbullying3_b,
                    cyberbullying4_b,cyberbullying5_b,cyberbullying6_b,
                    info_outlet_newspaper,
                    info_outlet_gov,info_outlet_fb,info_outlet_tt,info_outlet_twitter,
                    info_outlet_wa,info_source_mx,info_source_us,info_source_ngo,
                    info_source_family,info_source_acq,trust_newspaper,trust_gov,
                    trust_fb,trust_tt,trust_twitter,trust_wa,misinfo_id1_b,misinfo_id2_b,
                    misinfo_id3_b,misinfo_id4_b,misinfo_id5_b,misinfo_id1_e,misinfo_id2_e,
                    misinfo_id3_e,misinfo_id4_e,misinfo_id5_e,misinfo_share1_wa_b,
                    misinfo_share2_wa_b,misinfo_share3_wa_b,misinfo_share4_wa_b,
                    misinfo_share5_wa_b,misinfo_share1_fb_b,misinfo_share2_fb_b,
                    misinfo_share3_fb_b,misinfo_share4_fb_b,misinfo_share5_fb_b,
                    misinfo_share1_wa_e,misinfo_share2_wa_e,misinfo_share3_wa_e,
                    misinfo_share4_wa_e,misinfo_share5_wa_e,misinfo_share1_fb_e,
                    misinfo_share2_fb_e,misinfo_share3_fb_e,misinfo_share4_fb_e,
                    misinfo_share5_fb_e,income,social_desirability,misinfo_recall1_b,
                    misinfo_recall2_b,misinfo_recall3_b,misinfo_recall4_b,
                    misinfo_recall5_b), 
                  as.numeric),
           # sharing items become 1 (answer == 1) / 0 (any other answer)
           across(c(misinfo_share1_wa_b,
                    misinfo_share2_wa_b,misinfo_share3_wa_b,misinfo_share4_wa_b,
                    misinfo_share5_wa_b,misinfo_share1_fb_b,misinfo_share2_fb_b,
                    misinfo_share3_fb_b,misinfo_share4_fb_b,misinfo_share5_fb_b,
                    misinfo_share1_wa_e,misinfo_share2_wa_e,misinfo_share3_wa_e,
                    misinfo_share4_wa_e,misinfo_share5_wa_e,misinfo_share1_fb_e,
                    misinfo_share2_fb_e,misinfo_share3_fb_e,misinfo_share4_fb_e,
                    misinfo_share5_fb_e), sharing_recoding),
           # test of design
           # page-submit timers divided by 60 (presumably seconds -> minutes; confirm)
           timing_misinfo = `timing_m_treatment_Page Submit`/60,
           timing_cyber = `timing_c_treatment_Page Submit`/60,
           # 1 when the attention check was answered with option 3
           attention_check1_binary = ifelse(attention_check1==3, 1,0),
           attention_check2_binary = ifelse(attention_check2==3, 1,0),
           # covariates & subgroups
           female = ifelse(female==1, 1,0),
           # code 99 is set to missing
           income = case_when(income == 99 ~ NA_real_,
                              TRUE ~ income),
           no_education = ifelse(education_clean<2, 1,0), # less than elementary
           low_education = ifelse(education_clean<3, 1,0), # up to elementary
           # assumes age_1 holds the year of birth -- TODO confirm
           age = 2024 - age_num,
           catholic = ifelse(religion==1, 1,0),
           kids_above_mean = ifelse(as.numeric(children) > mean(as.numeric(children), na.rm = TRUE), 1,0),
           children_num = as.numeric(children),
           children_binary = ifelse(children_num==0, 0,1),
           single = ifelse(marital_status==1, 1,0),
           # region of the current location; locations not listed here stay NA
           location_cat = case_when(location_descriptive %in% c('Ciudad Juárez',
                                                                'Piedras Negras',
                                                                'Mexicali',
                                                                'Matamoros',
                                                                'Tijuana') ~ 'North',
                                    # location_descriptive was already translated,
                                    # so the English label has to be matched here
                                    location_descriptive == 'Mexico City' ~ 'Mexico City',
                                    location_descriptive == 'Tapachula' ~ 'South'),
           destination_US = ifelse(destination_descriptive=='USA', 1,0),
           motivated_reasoning_arrival = motivated_reason_1_1,
           motivated_reasoning_arrival_binary = ifelse(motivated_reasoning_arrival>mean(motivated_reasoning_arrival, na.rm = TRUE), 1,0),
           motivated_reasoning_asylum = motivated_reason_2_1,
           motivated_reasoning_asylum_binary = ifelse(motivated_reasoning_asylum>mean(motivated_reasoning_asylum, na.rm = TRUE), 1,0),
           motivated_reasoning_index = rowMeans(cbind(motivated_reasoning_arrival,
                                                      motivated_reasoning_asylum)),
           motivated_reasoning_binary = ifelse(motivated_reasoning_index>mean(motivated_reasoning_index, na.rm = TRUE), 1,0),
           trust_people_binary = ifelse(trust_people>2, 1,0), # trust = 1
           value_democracy_binary = ifelse(value_democracy>2, 1,0),
           # first principal component of the 15 digital literacy items
           # NOTE(review): with the default na.action a respondent with a missing
           # item would be dropped from the scores and the result would no longer
           # line up with the rows of dt -- confirm these items have no NAs.
           digital_literacy_pca_index = prcomp(~ digital_literacy_1+digital_literacy_2+digital_literacy_3+digital_literacy_4+
                                               digital_literacy_5+digital_literacy_6+digital_literacy_7+digital_literacy_8+
                                               digital_literacy_9+digital_literacy_10+digital_literacy_11+digital_literacy_12+
                                               digital_literacy_13+digital_literacy_14+digital_literacy_cbp)$x[,1],
           # (column name keeps its historical spelling "digitial" so that code
           # reading the saved data keeps working)
           digitial_literacy_index = rowMeans(cbind(digital_literacy_1,digital_literacy_2,digital_literacy_3,digital_literacy_4,
                                                   digital_literacy_5,digital_literacy_6,digital_literacy_7,digital_literacy_8,
                                                   digital_literacy_9,digital_literacy_10,digital_literacy_11,digital_literacy_12,
                                                   digital_literacy_13,digital_literacy_14,digital_literacy_cbp)),
           digital_literacy_binary = ifelse(digitial_literacy_index>mean(digitial_literacy_index, na.rm = TRUE), 1,0),
           # Treatments
           # arm is read from the display-order string recorded by the survey
           cybersecurity_treatment = case_when(Treatment_cybersecurity_DO=='timing_c_treatment|cybersec_t0' ~ 0,
                                               Treatment_cybersecurity_DO=='timing_c_treatment|cybersec_t1' ~ 1,
                                               TRUE ~ NA_real_),
           misinformation_treatment = case_when(str_detect(Treatment_misinformation_DO, 't0') ~ 0,
                                                str_detect(Treatment_misinformation_DO, 't1') ~ 1,
                                                str_detect(Treatment_misinformation_DO, 't2') ~ 2,
                                                TRUE ~ NA_real_) %>% as.factor,
           # any treatment arm (1 or 2) versus arm 0
           misinformation_binary = ifelse(misinformation_treatment==0, 0,1),
           # arm 1 versus arm 0 (arm 2 set to NA)
           misinformation_treatment_tips = case_when(misinformation_treatment==0 ~ 0,
                                                     misinformation_treatment==1 ~ 1,
                                                     TRUE ~ NA_real_),
           # arm 2 versus arm 0 (arm 1 set to NA)
           misinformation_treatment_tips_ex = case_when(misinformation_treatment==0 ~ 0,
                                                     misinformation_treatment==2 ~ 1,
                                                     TRUE ~ NA_real_),
           # arm 2 versus arm 1 (arm 0 set to NA)
           misinformation_att = case_when(misinformation_treatment==1 ~ 0,
                                          misinformation_treatment==2 ~ 1,
                                          TRUE ~ NA_real_) %>% as.factor,
           # posts 1-3 are the ones scored against answer 1 below
           num_fake_posts = 3,
           # Pre-treatment outcomes
           ## recall
           recall1_b = ifelse(misinfo_recall1_b==1, 1,0),
           recall2_b = ifelse(misinfo_recall2_b==1, 1,0),
           recall3_b = ifelse(misinfo_recall3_b==1, 1,0),
           exposure_misinfo_b = rowMeans(cbind(recall1_b,recall2_b,recall3_b)),
           # posts 1-3 count as correct when answered 1, posts 4-5 when answered 0
           post_1_correct_b = ifelse(misinfo_id1_b==1, 1,0),
           post_2_correct_b = ifelse(misinfo_id2_b==1, 1,0),
           post_3_correct_b = ifelse(misinfo_id3_b==1, 1,0),
           post_4_correct_b = ifelse(misinfo_id4_b==0, 1,0),
           post_5_correct_b = ifelse(misinfo_id5_b==0, 1,0),
           accuracy_rate_b = rowMeans(cbind(post_1_correct_b,post_2_correct_b,
                                            post_3_correct_b,post_4_correct_b,
                                            post_5_correct_b)),
           # NOTE(review): the endline counterpart (a_ratings_false_e) is
           # computed as 1 - rowMeans(...); the baseline version is not
           # reversed, so accuracy_discernment_b and accuracy_discernment_e
           # are not on the same scale. Confirm which definition is intended.
           a_ratings_false_b = rowMeans(cbind(post_1_correct_b,post_2_correct_b,
                                              post_3_correct_b)),
           a_ratings_true_b = rowMeans(cbind(post_4_correct_b,post_5_correct_b)),
           accuracy_discernment_b = a_ratings_true_b - a_ratings_false_b,
           post_1_verify_b = ifelse(misinfo_verify1_b==1, 1,0),
           post_2_verify_b = ifelse(misinfo_verify2_b==1, 1,0),
           post_3_verify_b = ifelse(misinfo_verify3_b==1, 1,0),
           post_4_verify_b = ifelse(misinfo_verify4_b==1, 1,0),
           post_5_verify_b = ifelse(misinfo_verify5_b==1, 1,0),
           verify_fake_b = rowSums(cbind(post_1_verify_b,post_2_verify_b,post_3_verify_b)),
           verification_tendency_rate_b = rowMeans(cbind(post_1_verify_b,post_2_verify_b,
                                                         post_3_verify_b,post_4_verify_b,
                                                         post_5_verify_b)),
           accuracy_verification_tendency_rate_b = verify_fake_b/num_fake_posts,
           # share of posts 1-3 / posts 4-5 shared on WhatsApp or Facebook
           dissemination_f_b = rowMeans(cbind(misinfo_share1_wa_b,
                                             misinfo_share2_wa_b,
                                             misinfo_share3_wa_b,
                                             misinfo_share1_fb_b,
                                             misinfo_share2_fb_b,
                                             misinfo_share3_fb_b)),
           dissemination_r_b = rowMeans(cbind(misinfo_share4_wa_b,
                                             misinfo_share5_wa_b,
                                             misinfo_share4_fb_b,
                                             misinfo_share5_fb_b)),
           sharing_discernment_b = dissemination_r_b - dissemination_f_b,
           # items 1 and 2 are reversed (10 - x) before entering the index
           online_risk1_b_1 = 10 - online_risk1_b_1,
           online_risk2_b_1 = 10 - online_risk2_b_1,
           online_risk_index_b = rowMeans(cbind(online_risk1_b_1,online_risk2_b_1,online_risk3_b_1)),
           online_risk_index_binary_b = ifelse(online_risk_index_b>mean(online_risk_index_b, na.rm = TRUE), 1,0),
           online_risk_index2_b = rowMeans(cbind(online_risk1_b_1,online_risk3_b_1)),
           online_risk_index2_binary_b = ifelse(online_risk_index2_b>mean(online_risk_index2_b, na.rm = TRUE), 1,0),
           # Post-treatment outcomes 
           trust_newspaper_binary = ifelse(trust_newspaper>2, 1,0),
           trust_gov_binary = ifelse(trust_gov>2, 1,0),
           trust_fb_binary = ifelse(trust_fb>2, 1,0),
           trust_tt_binary = ifelse(trust_tt>2, 1,0),
           trust_twitter_binary = ifelse(trust_twitter>2, 1,0),
           trust_wa_binary = ifelse(trust_wa>2, 1,0),
           trust_sources_index = rowMeans(cbind(trust_newspaper,trust_gov,
                                                trust_fb,trust_tt,trust_twitter,trust_wa)),
           # despite the name this is a share (mean of the six dummies), not 0/1
           trust_sources_index_binary = rowMeans(cbind(trust_newspaper_binary,
                                                       trust_gov_binary,
                                                       trust_fb_binary,
                                                       trust_tt_binary,
                                                       trust_twitter_binary,
                                                       trust_wa_binary)),
           post_1_correct_e = ifelse(misinfo_id1_e==1, 1,0),
           post_2_correct_e = ifelse(misinfo_id2_e==1, 1,0),
           post_3_correct_e = ifelse(misinfo_id3_e==1, 1,0), 
           post_4_correct_e = ifelse(misinfo_id4_e==0, 1,0),
           post_5_correct_e = ifelse(misinfo_id5_e==0, 1,0), 
           accuracy_rate_e = rowMeans(cbind(post_1_correct_e,post_2_correct_e,
                                            post_3_correct_e,post_4_correct_e,
                                            post_5_correct_e)),
           # r&r request: _discernment
           a_ratings_false_e = 1- rowMeans(cbind(post_1_correct_e,post_2_correct_e,
                                          post_3_correct_e)),
           a_ratings_true_e = rowMeans(cbind(post_4_correct_e,post_5_correct_e)),
           accuracy_discernment_e = a_ratings_true_e - a_ratings_false_e,
           post_1_verify_e = ifelse(misinfo_verify1_e==1, 1,0),
           post_2_verify_e = ifelse(misinfo_verify2_e==1, 1,0),
           post_3_verify_e = ifelse(misinfo_verify3_e==1, 1,0),
           post_4_verify_e = ifelse(misinfo_verify4_e==1, 1,0),
           post_5_verify_e = ifelse(misinfo_verify5_e==1, 1,0),
           verify_fake_e = rowSums(cbind(post_1_verify_e,post_2_verify_e,post_3_verify_e)),
           verification_tendency_rate_e = rowMeans(cbind(post_1_verify_e,post_2_verify_e,
                                                         post_3_verify_e,post_4_verify_e,
                                                         post_5_verify_e)),
           accuracy_verification_tendency_rate_e = verify_fake_e/num_fake_posts,
           dissemination_f_e = rowMeans(cbind(misinfo_share1_wa_e,
                                             misinfo_share2_wa_e,
                                             misinfo_share3_wa_e,
                                             misinfo_share1_fb_e,
                                             misinfo_share2_fb_e,
                                             misinfo_share3_fb_e)),
           dissemination_r_e = rowMeans(cbind(misinfo_share4_wa_e,
                                             misinfo_share5_wa_e,
                                             misinfo_share4_fb_e,
                                             misinfo_share5_fb_e)),
           sharing_discernment_e = dissemination_r_e - dissemination_f_e,
           online_risk1_e_1 = 10 - online_risk1_e_1,
           online_risk2_e_1 = 10 - online_risk2_e_1,
           # endline indices average whatever items were answered (na.rm = TRUE)
           online_risk_index_e = rowMeans(cbind(online_risk1_e_1,online_risk2_e_1,online_risk3_e_1),
                                          na.rm = TRUE),
           online_risk_index_binary_e = ifelse(online_risk_index_e>mean(online_risk_index_e, na.rm = TRUE), 1,0),
           online_risk_index2_e = rowMeans(cbind(online_risk1_e_1,online_risk3_e_1),
                                           na.rm = TRUE),
           online_risk_index2_binary_e = ifelse(online_risk_index2_e>mean(online_risk_index2_e, na.rm = TRUE), 1,0),
           # TRUE when the multi-select answer contains option 4, 5 or 6
           online_measures_tmp = str_remove_all(online_measures, ',') %>% str_detect(., '[456]'),
           # 0 when any of those options was picked, 1 otherwise
           safety_measures_e = ifelse(online_measures_tmp==TRUE, 0,1)
           )



# Write the cleaned data frame to disk as an RDS file in the working directory
saveRDS(object = dt, file = 'clean_data_MX.rds')

